/*******************************************************************************
* This program combines all variables used in Hirshleifer, Ling, and Wang (2024)
  to create the final sample; it joins the three datasets:
  -- sample_loc, created via 'SUE Sample Location Related.sas', which includes:
      - Facebook SCI
      - Urban/rural indicator

  -- sample_permno, which includes:
      - Various CARs: DGTW (via './DepVars/3-CAR-VOL-Persistence.sas')
      - Abnormal Tradding Volume (via './DepVars/3-CAR-VOL-Persistence.sas')
      - Institutional Ownership (via './DepVars/3-CAR-VOL-Persistence.sas')
      - Fama French 10/48 classification (via './DepVars/3-CAR-VOL-Persistence.sas')
      - Idiosyncratic Volatility (via './DepVars/3-CAR-VOL-Persistence.sas')
      - Volatility Persistence (via './DepVars/3-CAR-VOL-Persistence.sas')
      - Dollar Volume Persistence (via './DepVars/3-CAR-VOL-Persistence.sas')
      - S&P 500 indices (via './DepVars/3-CAR-VOL-Persistence.sas')
      - Advertisement Expenses (via './DepVars/3-CAR-VOL-Persistence.sas')
      - Historical NAICS (via './DepVars/3-CAR-VOL-Persistence.sas')
      - StockTwits Measures and Centralities (via './DepVars/4-StockTwits.sas')
      - Google trend (via './DepVars/5-GoogleSVI.sas')

  -- sample_earnings, which inlcudes:
      - SUE (updated via './DepVars/1-sue.sas')
      - Earings Volatility (via './DepVars/1-sue.sas')
      - Earnings Persistence ( via './DepVars/1-sue.sas')
      - Size (via '.DepVars/1-sue.sas')
      - BM ratio (via '.DepVars/1-sue.sas')
      - Reporting Lag (via '.DepVars/1-sue.sas');

  -- sample_loc, which includes:
      - Facebook CEN
      - Urban/rural indicator
      - County-level demographical controls (extrapolated/backfilled to 1990-2017)
          -- mean_age_ct
          -- retire_ratio_ct
          -- pop_ct
          -- mean_edu_ctbk/mean_edu_ctex
          -- ratio_clgplus_ctbk/ratio_clgplus_ctex
          -- movedin_ctbk/movedin_ctex
          -- median_income_ctex/median_income_ctbk
          --- suffix _ct means county-level, bk means missing value are backfilled
              ex means missing values are extrapolated using sas proc expand;

  In addition, this program also creates the following variables:
      - Same ind workforce
********************************************************************************/

/********************************************************************************
                        Sample Earnigns: sample_earnings
********************************************************************************/
** Creates sample_earnings;
data sample_earnings; set rep.earnings; run;

/********************************************************************************
                      Sample Stock Realted: sample_permno
********************************************************************************/
* Creating Permno-rdq1 pairs from SUE sample;
proc sql;
  create table permno_rdq1
  as select distinct permno, gvkey, fyearq, fqtr, rdq1
  from rep.earnings
  order by permno, rdq1;
  quit; 

* Join CARs, Abnormal Volumes, and Persistences of Volatility and Volumes;
* Join with Google SVI;
* Join with StockTwits;
data sample1;
  merge permno_rdq1 
        rep.car_vol
        rep.svi
        rep.stocktwits;
  by permno rdq1;
  if permno ne .;
  if rdq1 ne .;
  run;

proc sort data = sample1 nodupkey out =  sample2; by permno rdq1; run; 

* Join IO, Industry Classification, Idvol, S&P500 Index;
proc sql;
  * IO;
  create table sample3
  as select a.*, b.ior
  from sample2 as a left join rep.io_timeseries_2018 as b
  on a.permno = b.permno and intnx('quarter', a.rdq1, -1, 'E') = b.rdate;

  * match with sic code and exchcd;
  * will match using crsp siccd;
  * crsp_common_sic is upto 2018;
  create table sample4
  as select a.*, b.ff10, b.exchcd 
  from sample3 as a left join rep.crsp_common_sic as b
  on a.permno = b.permno and b.namedt<=a.rdq1<= b.nameendt;

  *rep.idvol_2018 is upto 2018;
  create table sample5
  as select a.*, b.idvol, b.nused as idvol_num
  from sample4 as a left join rep.idvol_2018 as b
  on a.permno = b.permno and intnx('month', a.rdq1, -1, 'E') = b.monthend;


  * S&P 500 dummy;
  create table sample6
  as select distinct a.*, 
    case
      when b.start<= a.rdq1 <= b.ending then 1
      else 0
    end as SP500
  from sample5 as a left join rep.sp500list b
  on a.permno = b.permno;

  * Advertisement Expenses;
  create table sample7
  as select distinct a.*, b.xad
  FROM sample6 AS a LEFT JOIN rep.xad AS b
  ON a.gvkey = b.gvkey and a.fyearq = b.fyear-1;

  * Link with NAICS codes;
  * Also match to previous year's info;
  create table sample8
  as select distinct a.*, b.ind
  from sample8 as a left join rep.naics_2018 as b
  on a.gvkey = b.gvkey and intnx('year', a.rdq1, -1, 'E') = b.year_date;

quit;

* Output;
data sample_permno;
  set sample8;
  run; 
* Housing Clearing; 
proc datasets lib=work;
    delete sample0-sample8 permno_rdq1;
quit;

/********************************************************************************
                      Sample Location Realted: sample_loc
********************************************************************************/
* Import facebook CEN measures;
  PROC IMPORT OUT= WORK.CEN
    DATAFILE= "../data/centrality.csv" /* adjsut the path depending on your os */
              DBMS=CSV REPLACE;
       GETNAMES=YES;
       DATAROW=2; 
  RUN;

* Link CEN to headquater addresses;
  proc sql;
    create table link1
    as select distinct a.*, b.*
    from rep.combined_sci_2018 (drop = latitude longitude) as a, 
         cen as b
    where a.own_county_id = b.county
    order by cik, beg_date;
    quit; 
  proc sort data = link1 nodupkey; by cik beg_date; run ; 

* Link local demographics to headquater addresses;

proc sql;
  create table link2
  as select distinct a.*, b.*
  from main3 as a left join rep.County_ctrl_2018 as b
  on year(a.rdq1) = b.year and a.county = b.fips;
  quit;
proc sort data = link2; by cik beg_date; run; 

* extend the first beg_date and the last end_date for each cik;
data sample_loc;  
  set link5;
  by cik beg_date;
  altbeg_date = beg_date;
  altend_date = end_date;
  if first.cik then altbeg_date = mdy(1,1,1963);
  if last.cik then altend_date = mdy(9,16,2018);
  format altbeg_date date9. altend_date date9.;
  run; 

/********************************************************************************
                                Combine and Save
********************************************************************************/


* Join three datasets;
proc sql;
  create table main0
  as select distinct a.*, b.*
  from sample_permno(where = (permno ne . and rdq1 ne .)) as a,
       sample_earnings(where = (permno ne . and rdq1 ne .)) as b
  where a.permno = b.permno and a.rdq1 = b.rdq1;

  * Match with locations;
  * Uses the extended beg_date and end_date to merge;
  * Restrictions can be later set on by requiring beg_date<rdq1<end_date;
  create table main1
  as select distinct a.*, b.*
  from main0 (where= (missing(cik) = 0)) as a , sample_loc as b
  where input(a.cik, 10.) = b.cik and b.altbeg_date <= a.rdq1 <b.altend_date
  order by a.gvkey, a.fyearq, a.fqtr;

  quit;

* Create same ind workforce;
proc sql;
  create table main2
    as select *,
    case when ind = 11 then nasci11_ctex/nasci00_ctex
      when ind = 21 then nasci21_ctex/nasci00_ctex
      when ind = 22 then nasci22_ctex/nasci00_ctex
      when ind = 23 then nasci23_ctex/nasci00_ctex
      when 31<= ind <=33 then nasci31_ctex/nasci00_ctex
      when ind = 42 then nasci42_ctex/nasci00_ctex
      when 44<= ind <=45 then nasci44_ctex/nasci00_ctex
      when 48<= ind <=49 then nasci48_ctex/nasci00_ctex
      when ind = 51 then nasci51_ctex/nasci00_ctex
      when ind = 52 then nasci52_ctex/nasci00_ctex
      when ind = 53 then nasci53_ctex/nasci00_ctex
      when ind = 61 then nasci61_ctex/nasci00_ctex
      when ind = 62 then nasci62_ctex/nasci00_ctex
      when ind = 71 then nasci71_ctex/nasci00_ctex
      when ind = 72 then nasci72_ctex/nasci00_ctex
      when ind = 81 then nasci81_ctex/nasci00_ctex
      when ind = 92 then nasci92_ctex/nasci00_ctex
  end as sind_p_ctex label = 'Percent of workforce, same industry',
  calculated sind_p_ctex*pop_ct_25plus as sind_n_ctex label = 'Number of workforce, same industry'
  from main3_1;
  quit;

proc sort data = main2 nodupkey; by gvkey rdq1; run;

* Expor to Stata;

PROC EXPORT DATA= WORK.main2
            OUTFILE= "../data/sue_sample.dta"
            DBMS=STATA REPLACE;
      RUN;
